import numpy as np
np.random.seed(7)
import tensorflow as tf
tf.__version__
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import random, re
import time
# used to supress display of warnings
import warnings
import missingno as mno
# nlp libraries
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from sklearn.feature_extraction.text import TfidfVectorizer
import holoviews as hv
from holoviews import opts
import os;
from os import makedirs
# sampling methods
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
# import zscore for scaling the data
from scipy.stats import zscore
from scipy.stats import randint as sp_randint
# save models
import pickle
# pre-processing methods
from sklearn.model_selection import train_test_split
# the classification models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# ensemble models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
# methods and classes for evaluation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, roc_auc_score
# cross-validation methods
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# feature selection methods
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
# pre-processing methods
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import LabelEncoder
## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing
## for explainer
## for word embedding
import gensim
import gensim.downloader as gensim_api
## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')
# Load the industrial safety & health incident dataset (accident descriptions
# plus categorical attributes) from the mounted Drive.
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Capstone/IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv")
data.head()
data.info()
#Shape of the data
print("Number of rows = {0} and Number of Columns = {1} in the Data frame".format(data.shape[0], data.shape[1]))
The raw date column is named 'Data' (it is renamed to 'Date' during cleansing below).
There are about 425 rows and 11 columns in the dataset.
#Data Cleansing
#Remove 'Unnamed: 0' column from Data frame (redundant positional index saved by the CSV export)
data.drop("Unnamed: 0", axis=1, inplace=True)
#Rename 'Data', 'Countries', 'Genre', 'Employee or Third Party' columns in Data frame
# to clearer English names used throughout the rest of the notebook.
data.rename(columns={'Data':'Date', 'Countries':'Country', 'Genre':'Gender', 'Employee or Third Party':'Employee type'}, inplace=True)
#Get the top 3 rows
data.head(3)
# Check duplicates in a data frame (count of fully-identical rows)
data.duplicated().sum()
#View the duplicate records
duplicates = data.duplicated()
data[duplicates]
#Delete duplicate rows (keeps the first occurrence of each duplicated row)
data.drop_duplicates(inplace=True)
data.shape
print("Number of rows = {0} and Number of Columns = {1} in the Data frame after removing the duplicates.".format(data.shape[0], data.shape[1]))
# Show the distinct values of every column except the free-text 'Description'
# column (whose values are essentially all unique and not informative here).
# NOTE: the loop body had lost its indentation in the source; restored here.
for col in data.columns:
    if col != 'Description':
        print('--'*30); print(f'Unique values of "{col}" column'); print('--'*30)
        print(data[col].unique())
        print('\n')
# Check the presence of missing values per column
data.isnull().sum()
#Data Pre-processing
# Parse the Date column once, then derive calendar features with the
# vectorized .dt accessor instead of row-wise .apply over Timestamp objects
# (same results, far less per-row Python overhead).
data['Date'] = pd.to_datetime(data['Date'])
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['Weekday'] = data['Date'].dt.day_name()
# Timestamp.weekofyear was deprecated and removed in pandas 2.0;
# isocalendar().week is the supported replacement (ISO week number,
# returned as UInt32 — cast to int for parity with the original column).
data['WeekofYear'] = data['Date'].dt.isocalendar().week.astype(int)
data.head()
EDA (Exploratory Data Analysis)
Variable identification and univariate distributions:
#Univariate Analysis
# Per-country record counts and percentage shares of the full dataset.
print('--'*30); print('Value Counts for `Country` label'); print('--'*30)
total_row_cnt = data.shape[0]
country_01_cnt = data[data.Country == 'Country_01'].shape[0]
country_02_cnt = data[data.Country == 'Country_02'].shape[0]
country_03_cnt = data[data.Country == 'Country_03'].shape[0]
print(f'Country_01 count: {country_01_cnt} i.e. {round(country_01_cnt/total_row_cnt*100, 0)}%')
print(f'Country_02 count: {country_02_cnt} i.e. {round(country_02_cnt/total_row_cnt*100, 0)}%')
print(f'Country_03 count: {country_03_cnt} i.e. {round(country_03_cnt/total_row_cnt*100, 0)}%')
print('--'*30); print('Distributon of `Country` label'); print('--'*30)
# NOTE(review): the pie labels are hardcoded; they assume value_counts()
# (sorted descending by count) comes back in Country_01..03 order — verify.
_ = data['Country'].value_counts().plot(kind = 'pie', autopct = '%.0f%%', labels = ['Country_01', 'Country_02', 'Country_03'], figsize = (10, 6))
# Percentage distribution of the `Local` (site/location) label as a bar chart.
local_cnt = np.round(data['Local'].value_counts(normalize=True) * 100)
hv.extension('bokeh')
hv.Bars(local_cnt).opts(title="Local Count", color="#8888ff", xlabel="Locals", ylabel="Percentage", yformatter='%d%%')\
.opts(opts.Bars(width=700, height=300,tools=['hover'],show_grid=True))
# Per-sector record counts and percentage shares, then a bar chart of the
# normalized distribution.
print('--'*30); print('Value Counts for `Industry Sector` label'); print('--'*30)
Mining_cnt = data[data['Industry Sector'] == 'Mining'].shape[0]
Metals_cnt = data[data['Industry Sector'] == 'Metals'].shape[0]
Others_cnt = data[data['Industry Sector'] == 'Others'].shape[0]
print(f'Mining count: {Mining_cnt} i.e. {round(Mining_cnt/total_row_cnt*100, 0)}%')
print(f'Metals count: {Metals_cnt} i.e. {round(Metals_cnt/total_row_cnt*100, 0)}%')
print(f'Others count: {Others_cnt} i.e. {round(Others_cnt/total_row_cnt*100, 0)}%')
print('--'*30); print('Distributon of `Industry Sector` label'); print('--'*30)
sector_cnt = np.round(data['Industry Sector'].value_counts(normalize=True) * 100)
hv.extension('bokeh')
hv.Bars(sector_cnt).opts(title="Industry Sector Count", color="#8888ff", xlabel="Sectors", ylabel="Percentage", yformatter='%d%%')\
.opts(opts.Bars(width=500, height=300,tools=['hover'],show_grid=True))
# Value counts for the two ordinal severity labels. A loop over the level
# names replaces twelve near-identical copy-pasted count/print pairs; level
# 'VI' is still probed for parity with the original (it reports 0 if absent).
severity_levels = ['I', 'II', 'III', 'IV', 'V', 'VI']
print('--'*30); print('Value Counts for `Accident Level` label'); print('--'*40)
for level in severity_levels:
    level_cnt = data[data['Accident Level'] == level].shape[0]
    print(f'Accident Level - {level} count: {level_cnt} i.e. {round(level_cnt/total_row_cnt*100, 0)}%')
# NOTE: the original header string was missing its closing backtick and the
# word 'label'; fixed here for consistency with the other section headers.
print('--'*30); print('Value Counts for `Potential Accident Level` label'); print('--'*40)
for level in severity_levels:
    level_cnt = data[data['Potential Accident Level'] == level].shape[0]
    print(f'Potential Accident Level - {level} count: {level_cnt} i.e. {round(level_cnt/total_row_cnt*100, 0)}%')
print('--'*30); print('Distributon of `Accident Level` & `Potential Accident Level` label'); print('--'*40)
# Side-by-side percentage distributions of actual vs potential severity:
# build one column per label, melt to long form, and render a grouped bar chart.
ac_level_cnt = np.round(data['Accident Level'].value_counts(normalize=True) * 100)
pot_ac_level_cnt = np.round(data['Potential Accident Level'].value_counts(normalize=True) * 100, decimals=1)
# fillna(0) covers levels present in one label but absent in the other.
ac_pot = pd.concat([ac_level_cnt, pot_ac_level_cnt], axis=1,sort=False).fillna(0).rename(columns={'Accident Level':'Accident', 'Potential Accident Level':'Potential'})
ac_pot = pd.melt(ac_pot.reset_index(), ['index']).rename(columns={'index':'Severity', 'variable':'Levels'})
hv.extension('bokeh')
hv.Bars(ac_pot, ['Severity', 'Levels'], 'value').opts(opts.Bars(title="Accident Levels Count", width=700, height=300,tools=['hover'],\
show_grid=True,xrotation=45, ylabel="Percentage", yformatter='%d%%'))
# Gender counts and percentage shares, then a bar chart of the distribution.
print('--'*30); print('Value Counts for `Gender` label'); print('--'*30)
Male_cnt = data[data['Gender'] == 'Male'].shape[0]
Female_cnt = data[data['Gender'] == 'Female'].shape[0]
print(f'Male count: {Male_cnt} i.e. {round(Male_cnt/total_row_cnt*100, 0)}%')
print(f'Female count: {Female_cnt} i.e. {round(Female_cnt/total_row_cnt*100, 0)}%')
print('--'*30); print('Distributon of `Gender` label'); print('--'*30)
gender_cnt = np.round(data['Gender'].value_counts(normalize=True) * 100)
hv.extension('bokeh')
hv.Bars(gender_cnt).opts(title="Gender Count", color="#8888ff", xlabel="Gender", ylabel="Percentage", yformatter='%d%%')\
.opts(opts.Bars(width=500, height=300,tools=['hover'],show_grid=True))
# Employee-type counts and percentage shares, then a bar chart of the distribution.
print('--'*30); print('Value Counts for `Employee type` label'); print('--'*30)
third_party_cnt = data[data['Employee type'] == 'Third Party'].shape[0]
emp_cnt = data[data['Employee type'] == 'Employee'].shape[0]
third_rem_cnt = data[data['Employee type'] == 'Third Party (Remote)'].shape[0]
print(f'Third Party count: {third_party_cnt} i.e. {round(third_party_cnt/total_row_cnt*100, 0)}%')
print(f'Employee count: {emp_cnt} i.e. {round(emp_cnt/total_row_cnt*100, 0)}%')
print(f'Third Party (Remote) count: {third_rem_cnt} i.e. {round(third_rem_cnt/total_row_cnt*100, 0)}%')
print('--'*30); print('Distributon of `Employee type` label'); print('--'*30)
emp_type_cnt = np.round(data['Employee type'].value_counts(normalize=True) * 100)
hv.extension('bokeh')
hv.Bars(emp_type_cnt).opts(title="Employee type Count", color="#8888ff", xlabel="Employee Type", ylabel="Percentage", yformatter='%d%%')\
.opts(opts.Bars(width=500, height=300,tools=['hover'],show_grid=True))
# Percentage distribution of `Critical Risk`; the category list is long, so
# the bars are inverted (horizontal) and reversed for readability.
cr_risk_cnt = np.round(data['Critical Risk'].value_counts(normalize=True) * 100)
hv.extension('bokeh')
hv.Bars(cr_risk_cnt[::-1]).opts(title="Critical Risk Count", color="#8888ff", xlabel="Critical Risks", ylabel="Percentage", xformatter='%d%%')\
.opts(opts.Bars(width=600, height=600,tools=['hover'],show_grid=True,invert_axes=True))
#Checking the proportion of Industry sector in different countries
indsec_cntry_table = pd.crosstab(index = data['Industry Sector'], columns = data['Country'])
indsec_cntry_table.plot(kind = 'bar', figsize=(8,8), stacked = True)
plt.title("Proportion of Industry Sector in different countries")
plt.show()
NLP preprocessing
# Checking 5 random Descriptions and accident_levels from the data where the length of headline is > 100;
# NOTE(review): random.sample uses the stdlib `random` module, which is never
# seeded here (only numpy is seeded at the top) — the sample differs per run.
indexes = list(data.loc[data['Description'].str.len() > 100, 'Description'].index)
rands = random.sample(indexes, 5)
descriptions, accident_levels = list(data.loc[rands, 'Description']), list(data.loc[rands, 'Accident Level'])
print('--'*40); print('Distributon of accident_level where the length of Description is > 100'); print('--'*40)
# NOTE(review): pie labels I..V are hardcoded; verify they match the
# descending order that value_counts() actually returns.
_ = data.loc[indexes, 'Accident Level'].value_counts().plot(kind = 'pie', autopct = '%.0f%%', labels = ['I', 'II', 'III', 'IV', 'V'], figsize = (10, 6))
#Converting description to lower case (first NLP normalization step; the
# cleaned text is kept in a new column so the raw Description is preserved)
data['Cleaned_Description'] = data['Description'].apply(lambda x : x.lower())
#library that contains punctuation
import string
string.punctuation
#defining the function to remove punctuation
def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character removed.

    NOTE: the original body had lost its indentation in the source; restored.
    str.translate performs the strip in one C-level pass instead of a
    per-character membership test against string.punctuation.
    """
    return text.translate(str.maketrans('', '', string.punctuation))
#storing the punctuation free text
data['Cleaned_Description']= data['Cleaned_Description'].apply(lambda x:remove_punctuation(x))
# collapse runs of spaces left behind by punctuation removal into one space
data['Cleaned_Description'] = data['Cleaned_Description'].apply(lambda x: re.sub(' +', ' ', x))
#remove stopwords
from nltk.corpus import stopwords
# Use a set for O(1) membership tests; this shadows the module-level
# stop_words list defined near the imports.
stop_words = set(stopwords.words('english'))
# domain-specific additions that carry no signal in these descriptions
stop_words.add('subject')
stop_words.add('http')
def remove_stopwords(text):
    """Drop every token found in the global `stop_words` set from *text*.

    NOTE: the original body had lost its indentation in the source; restored.
    """
    return " ".join(word for word in str(text).split() if word not in stop_words)

data['Cleaned_Description'] = data['Cleaned_Description'].apply(remove_stopwords)
import nltk
nltk.download('wordnet')
#defining the function for lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    """Return *text* with each whitespace-separated token lemmatized.

    NOTE: the original body had lost its indentation in the source; restored.
    """
    return " ".join(lemmatizer.lemmatize(word) for word in text.split())

data['Cleaned_Description'] = data['Cleaned_Description'].apply(lemmatize_words)
print('Get the length of each line, find the maximum length and print the maximum length line');
print('Length of line ranges from 64 to 672.'); print('--'*45)
# Get length (in characters) of each cleaned description
data['line_length'] = data['Cleaned_Description'].str.len()
print('Minimum line length: {}'.format(data['line_length'].min()))
print('Maximum line length: {}'.format(data['line_length'].max()))
print('Line with maximum length: {}'.format(data[data['line_length'] == data['line_length'].max()]['Cleaned_Description'].values[0]))
print('Get the number of words, find the maximum number of words and print the maximum number of words');
print('Number of words ranges from 10 to 98.'); print('--'*45)
# Get word count of each cleaned description (split on single spaces; runs of
# spaces were already collapsed during punctuation cleanup)
data['nb_words'] = data['Cleaned_Description'].apply(lambda x: len(x.split(' ')))
print('Minimum number of words: {}'.format(data['nb_words'].min()))
print('Maximum number of words: {}'.format(data['nb_words'].max()))
print('Line with maximum number of words: {}'.format(data[data['nb_words'] == data['nb_words'].max()]['Cleaned_Description'].values[0]))
# Render a word cloud over all cleaned descriptions.
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
desc = data['Cleaned_Description']
# Join the rows into one text blob. The original passed str(desc), which
# stringifies the Series repr — index numbers, an ellipsis, and the dtype
# footer all leaked into the cloud's vocabulary.
desc_text = " ".join(desc)
wordcloud = WordCloud(width = 1500, height = 800, random_state = 1, background_color='black', min_font_size=5, max_words=300, collocations=False).generate(desc_text)
plt.figure(figsize=(15,10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# Accuracy comparison table for the five best-performing Accident Level models.
models_list_accidentlevel = [
    "tf_vectorizer with naive_bayes",
    "tf_vectorizer with SVC",
    "cv_vectorizer with KNeighborsClassifier",
    "cv_vectorizer with SVC",
    "tf_vectorizer with KNeighborsClassifier",
]
# individual accuracy scores, kept as named variables for reuse below
acc1, acc2, acc3, acc4, acc5 = 80, 80, 80, 78, 78
accuracy_accidentlevel = [acc1, acc2, acc3, acc4, acc5]
# Build the table directly from a dict, which names the columns up front
# instead of creating 0/1 columns and renaming them afterwards.
df_acc = pd.DataFrame({
    'models_list_accidentlevel': models_list_accidentlevel,
    'accuracy_accidentlevel': accuracy_accidentlevel,
})
df_acc
import matplotlib.pyplot as plt
# Horizontal bar chart comparing the top-5 model accuracies; model names on
# the y-axis, accuracy on the x-axis.
y=["tf_vectorizer with naive_bayes","tf_vectorizer with SVC","cv_vectorizer with KNeighborsClassifier","cv_vectorizer with SVC",
"tf_vectorizer with KNeighborsClassifier"]
# getting values against each value of y
x=[acc1, acc2, acc3, acc4, acc5]
plt.barh(y, x)
# setting label of y-axis
plt.ylabel("Models to predict Accident level")
# setting label of x-axis
plt.xlabel("Accuracy")
plt.title("Comparison of accuracies of Top 5 models for Accident level")
plt.show()
Comparing all of the models evaluated above for the target label Accident Level, the LSTM with GloVe embeddings and the BiLSTM with GloVe embeddings achieve better accuracy than the others.